# Dickstein, Ho, and Mark (2023)
# This code creates a cleaned up household (subscriber)-level dataset from the Oregon Health Authority’s APAC dataset
# for the years 2014-2016.

# * # * # * # * # * # * #
# PRELIMINARIES         #
# * # * # * # * # * # * #

setwd("../library")
source("PreliminariesCode.R")

# * # * # * # * # * # * #
# READ IN DERIVED DATA  #
# * # * # * # * # * # * #

## Loading Subscriber Data:
SubDat_List<- list()
for (i in 5:7){
  SubDat_List[[i]] <- fread(paste0("orig/SubData", Shortyear.Vec[i], "7.csv"))
  print(paste0("Before the procedure, there are ", nrow(SubDat_List[[i]]), "observations"))
}
SubDat <- do.call("rbind", SubDat_List[5:7])
is.identified(SubDat, c("subscriberid", "year"))

# Loading Constructed Plan Data
constructed_plan_derived <- fread("orig/ConstructedPlanDat.csv")
constructed_plan_derived <- constructed_plan_derived[year %in% 2014:2016]
is.identified(constructed_plan_derived, c("constrplanid", "year"))

# Loading All Choice Premiums
all_choice_premiums <- fread("AllChoicePremiums.csv")
all_choice_premiums <- all_choice_premiums[year %in% 2014:2016]
is.identified(all_choice_premiums, c("subscriberid", "constructed_plan_year"))

# * # * # * # * # * # * #
# FORMAT DERIVED DATA   #
# * # * # * # * # * # * #

# Adjusting Variables in all_choice_premiums Data
all_choice_premiums[, mnc_plantype := MNC, ]
all_choice_premiums[, constructed_plan_year := paste0(
  substr(Market.Type, 0, 2),
  substr(Exchange, 0, 2),
  ratingarea,
  substr(PAYER_ID, 4, 5),
  metal,
  mnc_plantype,
  "_",
  year), ]

# Adjusting Variables in SubDat Data
SubDat[, mnc_plantype := if_else(best_guess_plantype %in% c("EPO", "HMO"), "MNC", "NOT"), ]
SubDat[, constructed_plan_year := paste0(
  substr(markettype, 0, 2),
  substr(exchange, 0, 2),
  best_guess_ra,
  substr(payer_id, 4, 5),
  best_guess_metal,
  mnc_plantype,
  "_",
  year
), ]

# Merging with all_choice Premiums to get variables on their choice
SubDat <- merge(
  SubDat, 
  all_choice_premiums[, c(
    "subscriberid", "year", "constructed_plan_year", "grossprem", "best_guess_Subsidy", "best_guess_AV", "best_guess_netprem"), with = F],
  by = c("subscriberid", "constructed_plan_year", "year"),
  all.x = T)

print("Formatting Complete")

# * # * # * # * # * # * #
# SUBSETTING            #
# * # * # * # * # * # * #

n_stage <- nrow(SubDat)
names(n_stage) <- "Full"

SubDat <-  SubDat[ (!is.na(constructed_plan_year)) ]
n_stage <- c(n_stage, nrow(SubDat))
names(n_stage)[which(names(n_stage) == "")] <- "No Constructed Plan Match"

SubDat <-  SubDat[ (payer_id != "M0099") ]
n_stage <- c(n_stage, nrow(SubDat))
names(n_stage)[which(names(n_stage) == "")] <- "Non-COOP"

SubDat <-  SubDat[ (!is.na(age)) ]
n_stage <- c(n_stage, nrow(SubDat))
names(n_stage)[which(names(n_stage) == "")] <- "Non-Missing Age"

SubDat <-  SubDat[ age >= 20 & age <= 65]
n_stage <- c(n_stage, nrow(SubDat))
names(n_stage)[which(names(n_stage) == "")] <- "Age in 20-65 Range"

SubDat <- SubDat[ (!is.na(mnc_plantype)) ]
n_stage <- c(n_stage, nrow(SubDat))
names(n_stage)[which(names(n_stage) == "")] <- "Non-Missing Managed Care Dummy"

SubDat <- SubDat[ (best_guess_market %in% c(1:2, 4)) ]
n_stage <- c(n_stage, nrow(SubDat))
names(n_stage)[which(names(n_stage) == "")] <- "Market 1,2,4"

SubDat <- SubDat[ (!is.na(nspouse)) ]
n_stage <- c(n_stage, nrow(SubDat))
names(n_stage)[which(names(n_stage) == "")] <- "Non-Missing nspouse"

SubDat <- SubDat[ (!is.na(ndeps)) ]
n_stage <- c(n_stage, nrow(SubDat))
names(n_stage)[which(names(n_stage) == "")] <- "Non-Missing ndeps"

SubDat <- SubDat[ (!is.na(sum_concurrent_risk)) ]
n_stage <- c(n_stage, nrow(SubDat))
names(n_stage)[which(names(n_stage) == "")] <- "Non-Missing ACG"

SubDat <-  SubDat[ (!is.na(best_guess_incomeoverFPL)) ]
n_stage <- c(n_stage, nrow(SubDat))
names(n_stage)[which(names(n_stage) == "")] <- "Non-Missing Income"

SubDat <-  SubDat[ (best_guess_metal %in% 2:4) ]
n_stage <- c(n_stage, nrow(SubDat))
names(n_stage)[which(names(n_stage) == "")] <- "B,S,G Metal Tier"

print("Subsetting Complete")

# * # * # * # * # * # * #
# CREATE NEW VARIABLES  #
# * # * # * # * # * # * #
income_cutbreaks <- c(0, 2.5, 8)
age_cutbreaks <- c(0, 17, 24, 34, 44, 54, 65, 100)

SubDat[, married := (nspouse > 0), ]
SubDat[, withkids := (ndeps > 0), ]
SubDat[, exog_exchange := substr(exchange, 0, 2), ]
SubDat[, married_kids := if_else(married,
                                 if_else(withkids, 2, 1),
                                 if_else(withkids, 3, 0)), ] # 3 if single+kids,
# 2 if married+kids, 1 if married+nokids, 0 otw
SubDat[, urban := best_guess_ra %in% c(1, 2, 3, 7), ]
SubDat[, portland := best_guess_ra %in% c(1), ]
SubDat[, fpl_bins := cut(as.numeric(best_guess_incomeoverFPL),
                         breaks = income_cutbreaks, label = FALSE), ] # this is a change
SubDat[, fpl_bins_label := cut(as.numeric(best_guess_incomeoverFPL),
                               breaks = income_cutbreaks), ]
SubDat[, age_bins := cut(age, breaks = age_cutbreaks, label = FALSE), ]
SubDat[, age_bins_label := cut(age, breaks = age_cutbreaks), ]
SubDat[, acg_quartiles := quantcut(sum_concurrent_risk, 4, labels=FALSE), ]
SubDat[, acg_quartiles_label := quantcut(sum_concurrent_risk, 4), ]
SubDat[, acg_max_quartiles := quantcut(max_concurrent_risk, 4, labels=FALSE), ]
SubDat[, acg_max_quartiles_label := quantcut(max_concurrent_risk, 4), ]

# Creating the two number of months variables: 
SubDat[, nummonths_observed := nummonthsmode, ]
SubDat[, nummonths_span := lastmonth - firstmonth + 1, ]

# Creating Monthly cost data
SubDat[, totpaidmonth_observed := totpaid / nummonths_observed,]
SubDat[, totcopaymonth_observed := totcopay / nummonths_observed,]
SubDat[, totcoinsmonth_observed := totcoins / nummonths_observed,]
SubDat[, totdeductmonth_observed := totdeduct / nummonths_observed,]
SubDat[, totpaidmonth_span := totpaid / nummonths_span,]
SubDat[, totcopaymonth_span := totcopay / nummonths_span,]
SubDat[, totcoinsmonth_span := totcoins / nummonths_span,]
SubDat[, totdeductmonth_span := totdeduct / nummonths_span,]
print("New Variable Creation Complete")

# * # * # * # * #
# SAVING DATASET#
# * # * # * # * #

write.csv(SubDat, file = "all_markets_subscribers.csv", row.names = F)

# * # * # * # * # * # * # * # * # * # * # * # * #
# ADDING GROUP_ID FOR UNINSURED POPULATION CODE #
# * # * # * # * # * # * # * # * # * # * # * # * #

# IMPORTANT: restrict to individual market: 
SubDat <- SubDat[best_guess_market %in% 1:2]

group_vars <- c(
  "year",
  "best_guess_ra",
  "age_bins",
  "acg_quartiles",
  "fpl_bins",
  "married",
  "withkids")

SubDat[, group_id := .GRP, by = group_vars]
SubDat[, group_size := .N, by = group_vars]

write.csv(SubDat, file = "grouped_subs.csv", row.names = F)

# * # * # * # * # * # * # * # * # * # * # * #
# AGGREGATING TO GET GROUP SIZES AND SAVE   #
# * # * # * # * # * # * # * # * # * # * # * #

GrpDat <- SubDat[, .(group_size = .N), by = c(group_vars, "group_id")]
write.csv(GrpDat, file = "grps.csv", row.names = F)
